suppressPackageStartupMessages({
library(epiwraps)
library(rtracklayer)
library(GenomicRanges)
library(ggplot2)
library(ComplexHeatmap)
library(chromVAR)
library(motifmatchr)
library(memes)
library(MotifDb)
library(universalmotif)
library(TFBSTools)
library(AnnotationHub)
library(RColorBrewer)
library(viridis)
library(viridisLite)
library(ggrepel)
library(magick)
library(ggpubr)
library(cowplot)
})
## See system.file("LICENSE", package="MotifDb") for use restrictions.
theme_set(theme_bw())
set.seed(123)
load signal files load Veh track for NF,Mono and Full; and give names
bw_ATAC_mono <- list.files(path = "../Dex_Veh/tracks/", pattern ="*Veh.(.*)mono.bw",full.names = TRUE )
bw_ATAC_nf <- list.files(path = "../Dex_Veh/tracks/", pattern ="*Veh.(.*)NF.bw",full.names = TRUE )
bw_ATAC_full <- list.files(path = "../Dex_Veh/tracks/", pattern ="*Veh.(.*)full.bw",full.names = TRUE )
names(bw_ATAC_mono) <- c("ATAC_Mono_1","ATAC_Mono_2")
names(bw_ATAC_nf) <- c("ATAC_NF_1","ATAC_NF_2")
names(bw_ATAC_full) <- c("ATAC_Full_1","ATAC_Full_2")
import 5fC sites and getting unique 5fC and no5fC sites
sites_5fC <- readRDS("../object_resources/5fC/sites_5fC.rds")
sites_no5fC <- readRDS("../object_resources/5fC/sites_no5fC.rds")
load TSS with and without 5fC
TSS_2KB_5fC <- readRDS("../object_resources/5fC/TSS_2KB_5fC.rds")
TSS_2KB_no5fC <- readRDS("../object_resources/5fC/TSS_2KB_no5fC.rds")
generation of normalization factors
bg_nf <- bwNormFactors(c(bw_ATAC_nf,bw_ATAC_mono,bw_ATAC_full), method = "background")
## Warning: bwNormFactors is deprecated, please gradually switch to
## `getNormFactors`.
## Comparing coverage in random regions...
sample no5fC regions to match
sampled_no5fC <- TSS_2KB_no5fC[sample(c(1:length(TSS_2KB_no5fC)),size = length(TSS_2KB_5fC))]
list_5fC_no5fC <- unlist(GRangesList(TSS_2KB_5fC,sampled_no5fC))
cluster regions 5fC top and no5fC bottom
# cluster regions 5fC top and no5fC bottom
i <- integer(length(list_5fC_no5fC))
i[1:(length(list_5fC_no5fC)/2)] <- "TSS with 5fC"
i[((length(list_5fC_no5fC)/2)+1):length(list_5fC_no5fC)] <- "TSS with no 5fC"
saveRDS(i,"../object_resources/figure_1_meta/cluster_5fC_no5fC.LIST.rds")
mycolors <- c("TSS with 5fC"=brewer.pal(n=9,"Set1")[1], "TSS with no 5fC"=brewer.pal(n=9,"Set1")[2])
m_5fC_no5fC <- signal2Matrix(c(bw_ATAC_nf,bw_ATAC_mono,bw_ATAC_full),w = 2L, regions = list_5fC_no5fC,type = "center")
## Reading ../Dex_Veh/tracks//Veh4.NF.bw
## Reading ../Dex_Veh/tracks//Veh5.NF.bw
## Reading ../Dex_Veh/tracks//Veh4.mono.bw
## Reading ../Dex_Veh/tracks//Veh5.mono.bw
## Reading ../Dex_Veh/tracks//Veh4.full.bw
## Reading ../Dex_Veh/tracks//Veh5.full.bw
m_5fC_no5fC_bg <- rescaleSignalMatrices(m_5fC_no5fC,bg_nf)
#merging replicates in the signal matrices
sm2 <- list( Nuc.Free =mergeSignalMatrices(m_5fC_no5fC_bg[1:2],aggregation = "mean"), Mononucleosome=mergeSignalMatrices(m_5fC_no5fC_bg[3:4],aggregation = "mean"), Full=mergeSignalMatrices(m_5fC_no5fC_bg[5:6],aggregation = "mean") )
saveRDS(sm2,"../object_resources/figure_1_meta/matrix_mm.rds")
m_mm <- readRDS("../object_resources/figure_1_meta/matrix_mm.rds")
mycolors <- c("TSS with 5fC"=brewer.pal(n=9,"Set1")[1], "TSS with no 5fC"=brewer.pal(n=9,"Set1")[2])
i <- readRDS("../object_resources/figure_1_meta/cluster_5fC_no5fC.LIST.rds")
p1 <- epiwraps::plotEnrichedHeatmaps((m_mm), row_split = i,scale_title = "Backg.\nnorm.\ndensity", axis_name = c("-2kb","TSS","+2kb"), colors = rocket(100), mean_color = mycolors, mean_scale_side = "left", top_annotation = FALSE,use_raster = TRUE)
p1
saveRDS(p1,"../object_resources/figure_1_meta/p1.rds")
generate individual matrices for 5fC and no5fC
# with 5fC
m_5fC <- signal2Matrix(c(bw_ATAC_nf,bw_ATAC_mono,bw_ATAC_full),w = 10L, regions = TSS_2KB_5fC)
## Reading ../Dex_Veh/tracks//Veh4.NF.bw
## Reading ../Dex_Veh/tracks//Veh5.NF.bw
## Reading ../Dex_Veh/tracks//Veh4.mono.bw
## Reading ../Dex_Veh/tracks//Veh5.mono.bw
## Reading ../Dex_Veh/tracks//Veh4.full.bw
## Reading ../Dex_Veh/tracks//Veh5.full.bw
m_5fC_bg <- rescaleSignalMatrices(m_5fC,bg_nf)
# without 5fC
m_no5fC <- signal2Matrix(c(bw_ATAC_nf,bw_ATAC_mono,bw_ATAC_full),w = 10L, regions = TSS_2KB_no5fC)
## Reading ../Dex_Veh/tracks//Veh4.NF.bw
## Reading ../Dex_Veh/tracks//Veh5.NF.bw
## Reading ../Dex_Veh/tracks//Veh4.mono.bw
## Reading ../Dex_Veh/tracks//Veh5.mono.bw
## Reading ../Dex_Veh/tracks//Veh4.full.bw
## Reading ../Dex_Veh/tracks//Veh5.full.bw
m_no5fC_bg <- rescaleSignalMatrices(m_no5fC,bg_nf)
aggregate signal
# with the matrices we generate dataframes with mean signal per position
d_5fC <- meltSignals(m_5fC_bg,trim = c(0.02,0.98))
d_no5fC <- meltSignals(m_no5fC_bg,trim = c(0.02,0.98))
# we merge both dataframes
dmerge <- dplyr::bind_rows(list("5fC"=d_5fC, "no5fC"=d_no5fC), .id="regions")
dmerge
saveRDS(dmerge,"../object_resources/figure_1_meta/dmerge_1.rds")
# we load the merged dataframe
dmerge <- readRDS("dmerge_1.rds")
# we plot the aggregate signals by region and fragment size distribution
p2 <- plot_grid(
ggplot(dmerge[dmerge$sample == "ATAC_NF_1"|dmerge$sample == "ATAC_NF_2",], aes(position, mean, colour=regions, group = regions, fill = regions)) +
geom_vline(xintercept=0, linetype="dashed") +
labs(x="TSS", y="mean background norm. density")+
stat_summary(geom="ribbon", alpha=0.3, colour = NA ) + stat_summary(fun=mean, geom="line", size = 0.6)+
ggtitle("Nuc. Free")+ scale_color_brewer(palette="Set1")+scale_fill_brewer(palette="Set1")+ylim(0,0.1)+theme(text = element_text(size = 10),legend.position = "none"),
ggplot(dmerge[dmerge$sample == "ATAC_Mono_1"|dmerge$sample == "ATAC_Mono_2",], aes(position, mean, colour=regions, group = regions, fill = regions)) +
geom_vline(xintercept=0, linetype="dashed") +
stat_summary(geom="ribbon", alpha=0.3, colour = NA ) + stat_summary(fun=mean, geom="line", size = 0.6)+
labs(x="TSS")+
ggtitle("Mononucleosome")+ scale_color_brewer(palette="Set1")+scale_fill_brewer(palette="Set1")+ylim(0,0.1)+theme(text = element_text(size = 10),axis.title.y = element_blank(),legend.position = "none",axis.text.y = element_blank()),
ggplot(dmerge[dmerge$sample == "ATAC_Full_1"|dmerge$sample == "ATAC_Full_2",], aes(position, mean, colour=regions, group = regions, fill = regions)) +
geom_vline(xintercept=0, linetype="dashed") +
stat_summary(geom="ribbon", alpha=0.3, colour = NA ) + stat_summary(fun=mean, geom="line", size = 0.6)+
xlab("TSS")+
ggtitle("Full")+ scale_color_brewer(palette="Set1")+scale_fill_brewer(palette="Set1")+ylim(0,0.1)+theme(text = element_text(size = 10),axis.title.y = element_blank(),axis.text.y = element_blank()),
ncol = 3, nrow = 1,rel_widths = c(1.15,1,1.2)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
## No summary function supplied, defaulting to `mean_se()`
p2
saveRDS(p2,"../object_resources/figure_1_meta/p2.rds")
previous data was generated with methylation data from an unpublished source therefore it is now repeated with published work (rando)
import unique 5fC sites in proximity to TSS
unique_5fC_sites_in_TSS <- readRDS("../object_resources/5fC/unique_5fC_sites_in_TSS.rds")
import genome fasta file (GRCm38)
genome <- Rsamtools::FaFile("../object_resources/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa")
genome
## class: FaFile
## path: ../object_resources/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa
## index: ../object_resources.../Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.fai
## gzindex: ../object_resourc.../Mus_musculus.GRCm38.dna_sm.primary_assembly.fa.gzi
## isOpen: FALSE
## yieldSize: NA
loading Rando data methylation levels at promoters and perform liftover (PL)
methylation_mm9 <- readRDS("../object_resources/GSE75323_sperm_prom_methylation.mm9.rds")
methylation_mm9 <- GRanges(methylation_mm9)
seqlevelsStyle(methylation_mm9)
## [1] "UCSC"
mm9_mm10_chain <- import.chain("../object_resources//mm9ToMm10.over.chain")
methylation_total <- liftOver(methylation_mm9,chain = mm9_mm10_chain)
methylation_total <- unlist(methylation_total)
seqlevelsStyle(methylation_total) <- "Ensembl"
saveRDS(methylation_total,"../object_resources/methylation_total_grcm38.GR.rds")
load GRCm38 sperm methylation data and check for promoters with methylation score >0.5
#methylation_total <- readRDS("../object_resources/methylation_rando_grcm38.GR.rds")
methyl_promoters <- methylation_total[methylation_total$meanMeth>0.5]
load motifs and check select non-redundant TFs
core <- read_meme(file = "../object_resources/HOCOMOCOv11_core_MOUSE_mono_meme_format.meme") # needs to be the right path
#full <- read_meme(file = "HOCOMOCOv11_full_MOUSE_mono_meme_format.meme")
# original function PL
summary_core <- universalmotif::summarise_motifs(core)
core <- setNames(object = core,nm = summary_core$name)
motifs <- core
modf <- data.frame(row.names=summary_core$name,
TF=gsub("_MOUSE.H11MO...+","",summary_core$name),
grade=gsub(".+\\.","",summary_core$name),
motif_rank = gsub("[^01]+","",gsub(".*MO.",replacement = "",summary_core$name))) #gsub("[^ro]+", "", str1)
modf <- modf[order(modf$TF,modf$grade,modf$motif_rank),]
modf <- modf[!duplicated(modf$TF),]
modf <- modf[!modf$grade == "D",]
motifs <- setNames(universalmotif::convert_motifs(motifs[row.names(modf)]), modf$TF)
peak_centers <- resize(unique_5fC_sites_in_TSS,fix = "center",width = 200)
peak_seqs <- memes::get_sequence(dropSeqlevels(peak_centers,"MT",pruning.mode = "coarse"), genome)# dropping MT due to its absence in reference genome file
peak_centers <- reduce(promoters(methyl_promoters, upstream = 1000, downstream = 100))
peak_seqs_methyl_prom <- memes::get_sequence(dropSeqlevels(peak_centers,"MT",pruning.mode = "coarse"), genome)# dropping MT due to its absence in reference genome file
ame <- memes::runAme(peak_seqs,control = peak_seqs_methyl_prom , database=convert_motifs(motifs,class = "universalmotif-universalmotif"), meme_path="/common/meme/bin/")
head(ame)
ame$motif <- basename(ame$motif_db)
saveRDS(ame,"../object_resources/figure_1_meta/motif_enrichment_5fC_5mC_promoter.rds")
load sample
#ame<- readRDS("../object_resources/figure_1_meta/motif_enrichment_rando.rds")
ame <- ame[!ame$fp==0,] #TFs with no false positive are removed since no fold change can be calculated
# results with no false positive were removed
p3 <- ggplot(ame, aes(log2(((tp/pos))/((fp/neg))), -log10(adj.pvalue))) +
geom_point(alpha=0.7, size =3) + geom_text_repel(data=head((ame),9), aes(label=motif),max.overlaps = 20, size = 5) +
labs(x="log2FE")+ xlim(c(0,6.5))+theme(text = element_text(size = 12))#+ ggtitle("5fC sites in TSS proximity compared to methylated promoter regions")
p3
#plot_grid(grid.grabExpr(ComplexHeatmap::draw(p1)), nrow=1, rel_widths=c(5,3))
pp <- plot_grid(plot_grid(grid.grabExpr(ComplexHeatmap::draw(p1)),p3,labels = c("A","C"),rel_widths = c(3,2)),p2,labels = c(NA,"B"),nrow = 2,rel_heights = c(3,2))
pp
## Warning: Removed 1 rows containing missing values (`geom_text()`).
# now add the title
title <- ggdraw() +
draw_label(
"Figure 1",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
pp2 <- plot_grid(
title, pp,
ncol = 1,
# rel_heights values control vertical title margins
rel_heights = c(0.1, 2)
)
## Warning: Removed 1 rows containing missing values (`geom_text()`).
pp2
pdf("figure_1_meta.pdf", width=10, height=7.5)
pp2
dev.off()
## png
## 2
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] cowplot_1.1.1 ggpubr_0.4.0 magick_2.7.3
## [4] ggrepel_0.9.3 viridis_0.6.2 viridisLite_0.4.1
## [7] RColorBrewer_1.1-3 AnnotationHub_2.22.1 BiocFileCache_1.14.0
## [10] dbplyr_2.1.1 TFBSTools_1.28.0 universalmotif_1.8.5
## [13] MotifDb_1.32.0 Biostrings_2.58.0 XVector_0.30.0
## [16] memes_0.99.11 motifmatchr_1.12.0 chromVAR_1.12.0
## [19] ggplot2_3.4.2 rtracklayer_1.50.0 epiwraps_0.99.62
## [22] EnrichedHeatmap_1.29.2 ComplexHeatmap_2.15.1 GenomicRanges_1.42.0
## [25] GenomeInfoDb_1.26.7 IRanges_2.24.1 S4Vectors_0.28.1
## [28] BiocGenerics_0.36.1
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 R.methodsS3_1.8.1
## [3] tidyr_1.2.0 bit64_4.0.5
## [5] knitr_1.42 DelayedArray_0.16.3
## [7] R.utils_2.11.0 data.table_1.14.8
## [9] rpart_4.1.16 KEGGREST_1.30.1
## [11] RCurl_1.98-1.6 AnnotationFilter_1.14.0
## [13] doParallel_1.0.17 generics_0.1.3
## [15] GenomicFeatures_1.42.3 RSQLite_2.2.11
## [17] bit_4.0.5 tzdb_0.3.0
## [19] xml2_1.3.3 httpuv_1.6.9
## [21] SummarizedExperiment_1.20.0 assertthat_0.2.1
## [23] DirichletMultinomial_1.32.0 xfun_0.38
## [25] hms_1.1.1 jquerylib_0.1.4
## [27] evaluate_0.20 promises_1.2.0.1
## [29] fansi_1.0.4 progress_1.2.2
## [31] caTools_1.18.2 DBI_1.1.3
## [33] htmlwidgets_1.5.4 purrr_1.0.1
## [35] ellipsis_0.3.2 dplyr_1.1.1
## [37] backports_1.4.1 annotate_1.68.0
## [39] biomaRt_2.46.3 MatrixGenerics_1.2.1
## [41] vctrs_0.6.1 Biobase_2.50.0
## [43] ensembldb_2.14.1 Cairo_1.6-0
## [45] abind_1.4-5 cachem_1.0.7
## [47] withr_2.5.0 Gviz_1.34.1
## [49] BSgenome_1.58.0 vroom_1.5.7
## [51] checkmate_2.1.0 GenomicAlignments_1.26.0
## [53] prettyunits_1.1.1 cluster_2.1.4
## [55] lazyeval_0.2.2 seqLogo_1.56.0
## [57] crayon_1.5.2 edgeR_3.32.1
## [59] pkgconfig_2.0.3 labeling_0.4.2
## [61] pkgload_1.3.2 ProtGenerics_1.22.0
## [63] nnet_7.3-17 rlang_1.1.0
## [65] lifecycle_1.0.3 miniUI_0.1.1.1
## [67] dichromat_2.0-0.1 rprojroot_2.0.3
## [69] matrixStats_0.63.0 Matrix_1.5-3
## [71] ggseqlogo_0.1 carData_3.0-5
## [73] base64enc_0.1-3 processx_3.5.3
## [75] GlobalOptions_0.1.2 png_0.1-7
## [77] rjson_0.2.21 bitops_1.0-7
## [79] splitstackshape_1.4.8 cmdfun_1.0.2
## [81] R.oo_1.24.0 blob_1.2.2
## [83] shape_1.4.6 stringr_1.5.0
## [85] readr_2.1.2 jpeg_0.1-10
## [87] rstatix_0.7.0 ggsignif_0.6.4
## [89] CNEr_1.26.0 scales_1.2.1
## [91] memoise_2.0.1 magrittr_2.0.3
## [93] plyr_1.8.8 zlibbioc_1.36.0
## [95] compiler_4.0.3 clue_0.3-60
## [97] Rsamtools_2.6.0 cli_3.6.1
## [99] ps_1.6.0 htmlTable_2.4.0
## [101] Formula_1.2-5 MASS_7.3-56
## [103] tidyselect_1.2.0 stringi_1.7.12
## [105] highr_0.10 yaml_2.3.7
## [107] askpass_1.1 locfit_1.5-9.4
## [109] latticeExtra_0.6-29 sass_0.4.1
## [111] VariantAnnotation_1.36.0 tools_4.0.3
## [113] circlize_0.4.15 rstudioapi_0.13
## [115] TFMPvalue_0.0.8 foreach_1.5.2
## [117] foreign_0.8-84 gridExtra_2.3
## [119] farver_2.1.1 digest_0.6.31
## [121] BiocManager_1.30.20 shiny_1.7.1
## [123] pracma_2.3.8 Rcpp_1.0.10
## [125] car_3.0-12 broom_0.7.12
## [127] BiocVersion_3.12.0 later_1.3.0
## [129] httr_1.4.2 AnnotationDbi_1.52.0
## [131] biovizBase_1.38.0 Rdpack_2.3
## [133] colorspace_2.1-0 brio_1.1.3
## [135] XML_3.99-0.9 splines_4.0.3
## [137] plotly_4.10.0 xtable_1.8-4
## [139] jsonlite_1.8.4 poweRlaw_0.70.6
## [141] GenomicFiles_1.26.0 UpSetR_1.4.0
## [143] testthat_3.1.2 R6_2.5.1
## [145] Hmisc_4.6-0 pillar_1.9.0
## [147] htmltools_0.5.5 mime_0.12
## [149] glue_1.6.2 fastmap_1.1.1
## [151] DT_0.22 BiocParallel_1.24.1
## [153] interactiveDisplayBase_1.28.0 codetools_0.2-19
## [155] utf8_1.2.3 lattice_0.21-8
## [157] bslib_0.3.1 tibble_3.2.1
## [159] curl_5.0.0 gtools_3.9.4
## [161] GO.db_3.12.1 openssl_2.0.0
## [163] survival_3.3-1 limma_3.46.0
## [165] rmarkdown_2.13 desc_1.4.2
## [167] munsell_0.5.0 GetoptLong_1.0.5
## [169] GenomeInfoDbData_1.2.4 iterators_1.0.14
## [171] reshape2_1.4.4 gtable_0.3.3
## [173] rbibutils_2.2.8